#########################
# Martin Vinæs Larsen   
# Jacob Nyrup           
# Michael Bang Petersen 
#
# Replication R command file for:
# Do Survey Estimates of the Public’s Compliance with COVID-19 Regulations Suffer from Social Desirability Bias?,
# Journal of Behavioral Public Administration
#
# Created May 2020
#
#########################

###################
## Load packages ##
###################

pacman::p_load(here,tidyverse,openxlsx,stringr,rdrobust,interplot,patchwork,
               broom,haven,labelled,GGally,ggthemes,estimatr,wesanderson,
               rddensity,ggthemes,skimr,gganimate,dplyr,list,Rmisc,extrafont,cowplot,
               stargazer,qwraps2,TOSTER,rsample,misreport)

###################
#### Load data ####
###################

# Load Petersen df

df_p <-  readRDS("df_petersen.rds")

# Load Nyrup & Vinæs df

df_nv <- read_dta("../../Data/SocialDistance.dta") %>% 
               mutate(Q3_Version = dplyr::recode(Q3_Version,`2`=0),
                      Q4_Version = dplyr::recode(Q4_Version,`2`=0),
                      Q5_Version = dplyr::recode(Q5_Version,`2`=0),
                      Q6_1_resp_bin = dplyr::recode(Q6_1_resp,`2`=0,`3`=0),
                      Q6_2_resp_bin = dplyr::recode(Q6_2_resp,`2`=0,`3`=0),
                      Q6_3_resp_bin = dplyr::recode(Q6_3_resp,`2`=0,`3`=0),
                      Q6_4_resp_bin = dplyr::recode(Q6_4_resp,`1`=0,`2`=1,`3`=0),
                      Q6_5_resp_bin = dplyr::recode(Q6_5_resp,`2`=0,`3`=0))

df_nv <-  readRDS("df_nyrupvinaes.rds")

##################
#### Figure 1 ####
##################

### Set up Bootstrap function (from Coppock)

# helper function to estimate sensitivity bias (with bootstrapped SEs)
estimate_sensitivity_bias <-
  function(data, alpha = 0.05, sims = 2000) {
    direct_minus_list <-
      function(data) {
        with(data, mean(D) - (mean(Y[Z == 1]) - mean(Y[Z == 0])))
      }
    
    bootstrap_distribution <-
      bootstraps(data, times = sims, apparent = TRUE) %>%
      mutate(estimate = map_dbl(splits, ~ direct_minus_list(analysis(.)))) %>%
      pull(estimate)
    
    tibble(
      estimate = direct_minus_list(data),
      std.error = sd(bootstrap_distribution),
      statistic = estimate / std.error,
      p.value = pnorm(abs(statistic), lower.tail = FALSE) * 1.96,
      conf.low = quantile(bootstrap_distribution, alpha / 1.96),
      conf.high = quantile(bootstrap_distribution, 1 - (alpha / 1.96))
    )
  }

# helper function to estimate sensitivity bias (with bootstrapped SEs)
list_boot <-
  function(data, alpha = 0.05, sims = 2000) {
    direct_minus_list <-
      function(data) {
        with(data,(mean(Y[Z == 1]) - mean(Y[Z == 0])))
      }
    
    bootstrap_distribution <-
      bootstraps(data, times = sims, apparent = TRUE) %>%
      mutate(estimate = map_dbl(splits, ~ direct_minus_list(analysis(.)))) %>%
      pull(estimate)
    
    tibble(
      estimate = direct_minus_list(data),
      std.error = sd(bootstrap_distribution),
      statistic = estimate / std.error,
      p.value = pnorm(abs(statistic), lower.tail = FALSE) * 1.96,
      conf.low = quantile(bootstrap_distribution, alpha / 1.96),
      conf.high = quantile(bootstrap_distribution, 1 - (alpha / 1.96))
    )
  }

# Format data

df_kok <- df_p %>% dplyr::rename("Z"="treatment", "Y"="Ex_Response","D"="kok") %>%
  dplyr::filter(!is.na(D) & !is.na(Y) & !is.na(Z)) %>% dplyr::select(D,Y,Z)

df_friends <- df_nv %>% dplyr::rename("Z" = "Q3_Version", "Y" = "Q3", "D" = "Q6_3_resp_bin") %>% dplyr::select(D,Y,Z) %>% as.data.frame()

df_gathering <- df_nv %>% dplyr::rename("Z" = "Q4_Version", "Y" = "Q4", "D" = "Q6_2_resp_bin") %>% dplyr::select(D,Y,Z) %>% as.data.frame()

# Create combined data

df_p_comb <- df_p %>% dplyr::select(D = kok, Y = Ex_Response, Z = Ex_Group) %>%
  mutate(Z = dplyr::recode(Z,"qExperiment_1B"=0,"qExperiment_1A"=1))

df_nv_comb_friend <- df_nv %>% dplyr::select(D = Q6_3_resp_bin, Y = Q3, Z = Q3_Version)

df_nv_comb_event <- df_nv %>% dplyr::select(D = Q6_2_resp_bin, Y = Q4, Z = Q4_Version)

df_comb <- rbind(df_p_comb,df_nv_comb_friend,df_nv_comb_event) %>% filter(!is.na(D))

### Run 

## Hugged or kissed someone outside my immediate family yesterday

# Direct estimate

direct_kok <- summarySE(df_kok,"D",conf.interval=0.95,na.rm=TRUE)

# List experiment estimate

listboot_kok <- list_boot(df_kok)

# Difference between direct and list

dif_kok <- estimate_sensitivity_bias(df_kok)

## Visit,  or get a visit from, a friend or colleague

# Direct estimate

direct_friends <- summarySE(df_friends,"D",conf.interval=0.95,na.rm=TRUE)

# List experiment estimate

listboot_friends <- list_boot(df_friends)

# Difference between direct and list

dif_friends <- estimate_sensitivity_bias(df_friends)

## Attend a gathering with a large number of people

# Direct estimate

direct_gathering <- summarySE(df_gathering,"D",conf.interval=0.95,na.rm=TRUE)

# List experiment estimate

listboot_gathering <- list_boot(df_gathering)

# Difference between direct and list

dif_gathering <- estimate_sensitivity_bias(df_gathering)

## Combined

# Direct estimate

direct_comb <- summarySE(df_comb,"D",conf.interval=0.95,na.rm=TRUE)

# List experiment estimate

listboot_comb <- list_boot(df_comb)

# Difference between direct and list

dif_comb <- estimate_sensitivity_bias(df_comb)

### Create combined dataset

stats <- tibble("group"=c(rep("kok",3),
                          rep("friend",3),
                          rep("event",3),
                          rep("comb",3)),
                "list"=c(rep(c("direct","listest","difference"),4)),
                "estimate"=c(direct_kok$D,
                             listboot_kok$estimate,dif_kok$estimate,
                             direct_friends$D,
                             listboot_friends$estimate,dif_friends$estimate,
                             direct_gathering$D,
                             listboot_gathering$estimate,dif_gathering$estimate,
                             direct_comb$D,
                             listboot_comb$estimate,dif_comb$estimate),
                "se"=c(direct_kok$se,
                       listboot_kok$std.error,dif_kok$std.error,
                       direct_friends$se,
                       listboot_friends$std.error,dif_friends$std.error,
                       direct_gathering$se,
                       listboot_gathering$std.error,dif_gathering$std.error,
                       direct_comb$se,
                       listboot_comb$std.error,dif_comb$std.error)) %>%
  mutate(ci = 1.96*se, 
         list = fct_relevel(list,"difference","listest","direct"))

#stats$group <- factor(stats$group,
#                levels = c("combined", "direct", "listest", "difference"))

### Create graph

figure1 <- ggplot(stats, aes(x=group, y=estimate, fill=list)) +
  geom_bar(position=position_dodge(.9), colour="black", stat="identity") +
  geom_errorbar(position=position_dodge(.9), width=.25, aes(ymin=estimate-ci, ymax=estimate+ci)) +
  coord_flip(ylim=c(-0.15,0.30)) + 
  scale_fill_manual(values=c("#354823","#FAD510","#273046"),breaks=c("direct", "listest","difference"),
                    labels=c("Direct question", "List-experiment","Difference in estimated non-compliance rate"),
                    name="") +
  scale_y_continuous(labels = function(x) paste0(x*100, "%")) +
  theme_bw() +
  geom_hline(yintercept=38) +
  theme(panel.border = element_blank(),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.text = element_text(size=11),
        axis.text=element_text(size=11, colour="black"),
        axis.title.x=element_text(size=11),
        axis.title.y=element_text(size=11,angle=0,vjust=0.6)
  ) +
  ylab("Share\nnon-compliant") + 
  xlab("") +
  #labs(title = "I de kommende dage, har du så tænkt dig at:") +
  scale_x_discrete(labels=c("Three questions \ncombined",
                            "Attend a gathering\nwith a large\nnumber of people",
                            "Visit, or get a visit\nfrom, a friend\nor colleague",
                            "Hugged or kissed\nsomeone outside my\nimmediate family yesterday"
  ))

figure1

#ggsave(
#  '../Output/figure1_updated.eps',
#  figure1,
#  width = 10,
#  height = 6,
#  dpi = 600
#)

#####################
### Design effect ###
#####################

# Hugged or kissed someone outside my immediate family yesterday

df_p <- df_p %>% filter(!is.na(Ex_Response)) # Remove NA

test_kok <- ict.test(df_p$Ex_Response,df_p$treatment,J=4,gms=TRUE,alpha = 0.05)

# Visit,  or get a visit from, a friend or colleague

test_friend <- ict.test(df_nv$Q3,df_nv$Q3_Version,J=4,gms=TRUE,alpha = 0.05)

# Attend a gathering with a large number of people

test_event <- ict.test(df_nv$Q4,df_nv$Q4_Version,J=4,gms=TRUE,alpha = 0.05)

# Combined

test_comb <- ict.test(df_comb$Y,df_comb$Z,J=4,gms=TRUE,alpha = 0.05)

#####################################
# Appendix B: Descriptive statitics #
#####################################

# Bang Petersen dataset

df_p_desc <- df_p %>% dplyr::select("bagg01.Koen","bagg02.Alder",
                                    "bagg03.Region","bagg05.Uddannelse","baggC.Beskæftigelse") %>%
  mutate(bagg05.Uddannelse = recode(bagg05.Uddannelse,"temp cate"="Gymnasie, HF, studenterkursus")) %>%
  mutate(bagg01.Koen = as.numeric(bagg01.Koen)) %>%
  mutate(bagg01.Koen = dplyr::recode(bagg01.Koen,`2`=0))
df_p_desc <- as.data.frame(df_p_desc)

df_p_desc <- cbind(df_p_desc, sapply(levels(df_p_desc$bagg03.Region), function(x) as.integer(x == df_p_desc$bagg03.Region)))
df_p_desc <- cbind(df_p_desc, sapply(levels(df_p_desc$bagg05.Uddannelse), function(x) as.integer(x == df_p_desc$bagg05.Uddannelse)))
df_p_desc <- cbind(df_p_desc, sapply(levels(df_p_desc$baggC.Beskæftigelse), function(x) as.integer(x == df_p_desc$baggC.Beskæftigelse)))

stargazer(df_p_desc, title="Descriptive statistics", digits=2,
          omit.summary.stat = c("p25","p75"),type = "html", #out="../Output/descp.html",
          covariate.labels=c("Male","Age","North Denmark Region","Central Denmark Region",
                             "Region of Southern Denmark","Capital Region of Denmark",
                             "Region Zealand","Primary school","General upper secondary education",
                             "Vocational upper secondary education",
                             "Vocational education",
                             "Short-cycle higher education",
                             "Medium-cycle higher education",
                             "Bachelor's degree","Long cycle higher education",
                             "Employed in the private sector",
                             "Employed in the public sector",
                             "Self-employed",
                             "Enrolled in education",
                             "Unemployed","Pensioner/early retirement",
                             "Other")
)

# Nyrup & Vinæs dataset

df_nv_desc <- df_nv %>% dplyr::select(bg1,alder,bg5,reg,bg8)

df_nv_desc$reg <- fct_relevel(df_nv_desc$reg,c("Nordjylland","Midtjylland","Syddanmark","Hovedstaden","Sjælland"))

df_nv_desc <- cbind(df_nv_desc, sapply(levels(df_nv_desc$reg), function(x) as.integer(x == df_nv_desc$reg)))
df_nv_desc <- cbind(df_nv_desc, sapply(levels(df_nv_desc$bg5), function(x) as.integer(x == df_nv_desc$bg5)))
df_nv_desc <- cbind(df_nv_desc, sapply(levels(df_nv_desc$bg8), function(x) as.integer(x == df_nv_desc$bg8)))

stargazer(df_nv_desc, title="Descriptive statistics", digits=2,
          omit.summary.stat = c("p25","p75"),
          type = "html", #out="../Output/descnv.html",
          covariate.labels=c("Male","Age","North Denmark Region","Central Denmark Region",
                             "Region of Southern Denmark","Capital Region of Denmark",
                             "Region Zealand","Primary school","Upper secondary education",
                             "Vocational education",
                             "Short-cycle higher education",
                             "Medium-cycle higher education","Long cycle higher education","Other education",
                             "Employed","Selfemployed","Enrolled in education","Long-term sick leave",
                             "Unemployed","Pensioner/early retirement","Other")
)

##############################
# Appendix C: Response dist. #
##############################

# Hugged or kissed someone outside my immediate family yesterday

tab_kok <- data.frame(table(df_p$Ex_Response,df_p$Ex_Group)) %>% 
  mutate(question = "Hugged or kissed", Var2 = dplyr::recode(Var2,qExperiment_1B=0,qExperiment_1A=1)) %>%
  dplyr::rename(treatment = Var2,choices = Var1)

# Visit,  or get a visit from, a friend or colleague

tab_friend <- data.frame(table(df_nv$Q3,df_nv$Q3_Version)) %>% mutate(question = "Friends") %>%
  dplyr::rename(treatment = Var2,choices = Var1)

# Attend a gathering with a large number of people

tabq_event <- data.frame(table(df_nv$Q4,df_nv$Q4_Version)) %>% mutate(question = "Event") %>%
  dplyr::rename(treatment = Var2,choices = Var1)
